library(dplyr)
library(forcats)
library(ggplot2)
library(plotly)
library(broom)
library(boot)
library(caret)
library(pROC)

# Project Part 2
# Group 9
# Loading libraries
# Loading dataset
dataset <- read.csv("SMARTc.csv", sep = ";") # Without missing values

# Re-encode the categorical variables ----
# Every categorical column is converted to a factor and its numeric codes are
# replaced by readable labels. Columns that share a coding scheme are recoded
# together with across() so the mapping is written only once.
dataset <- mutate(
  dataset,
  # 0/1 indicator columns -> "no"/"yes"
  across(
    c(EVENT, DIABETES, CEREBRAL, CARDIAC, AAA, PERIPH, STENOSIS),
    function(x) fct_recode(factor(x), "no" = "0", "yes" = "1")
  ),
  # 1/2 -> "male"/"female"
  SEX = fct_recode(factor(SEX), "male" = "1", "female" = "2"),
  # 1/2/3 -> "never"/"former"/"current"
  across(
    c(SMOKING, alcohol),
    function(x) {
      fct_recode(factor(x), "never" = "1", "former" = "2", "current" = "3")
    }
  ),
  # 1/2/3 -> "no"/"micro"/"macro"
  albumin = fct_recode(
    factor(albumin),
    "no" = "1", "micro" = "2", "macro" = "3"
  )
)

# Logistic regression model of EVENT ----
# Assessing performance using cross-validation
# k-fold cross-validation of the logistic regression model of EVENT.
# For every fold the model is fitted on the rows outside the fold, predicted
# probabilities are computed for the held-out rows, and the resulting ROC
# curve is stored in roc_list (one element per fold).
k <- 5

# Fix the RNG state so the random fold assignment (and therefore every ROC
# curve below) is reproducible between runs.
set.seed(2024)

# returnTrain = FALSE: each list element holds the row indices of the
# held-out (test) part of that fold.
folds <- createFolds(dataset$EVENT, k = k, list = TRUE, returnTrain = FALSE)

roc_list <- vector("list", length(folds))
for (i in seq_along(folds)) {
  train <- dataset[-folds[[i]], ]
  test <- dataset[folds[[i]], ]

  fit_train <- glm(
    EVENT ~ AGE + SEX + BMI + SYSTH + HDL + DIABETES +
      HISTCAR2 + HOMOC + log(CREAT) + STENOSIS + IMT + SMOKING +
      alcohol + albumin,
    data = train,
    family = "binomial"
  )

  # type = "response" returns predicted probabilities rather than log-odds.
  predict_test <- predict(fit_train, newdata = test, type = "response")

  # Set the control/case levels and the direction explicitly. Left on "auto",
  # pROC picks the direction separately in each fold, which can silently flip
  # a fold's curve and inflate its AUC.
  roc_i <- roc(
    test$EVENT,
    predict_test,
    levels = c("no", "yes"),
    direction = "<"
  )
  roc_list[[i]] <- roc_i
}

# Code to plot the ROC curves ----
# Stack the ROC coordinates of all folds into one long data frame, with a
# `Fold` label identifying which curve each row belongs to.
# NOTE: the `Specificity` column actually holds 1 - specificity; the name is
# kept because the plotting code maps this column to the x axis.
# seq_along() keeps the call safe when roc_list is empty (1:length() would
# iterate over c(1, 0) and fail).
roc_df <- do.call(rbind, lapply(seq_along(roc_list), function(i) {
  data.frame(
    Fold = paste("Fold", i),
    Sensitivity = roc_list[[i]]$sensitivities,
    Specificity = 1 - roc_list[[i]]$specificities
  )
}))
# Draw one ROC curve per cross-validation fold and display it as an
# interactive plotly widget.
# The x aesthetic uses the `Specificity` column of roc_df, which already
# contains 1 - specificity (hence the axis label).
roc_plot <- ggplot(
  roc_df,
  aes(x = Specificity, y = Sensitivity, color = Fold)
) +
  # geom_path() connects the points in the order they appear in the data.
  # geom_line() would re-sort them by x and, where several points share the
  # same x value, join them in the wrong order, drawing a sawtooth instead of
  # the vertical steps of the ROC curve.
  geom_path(linewidth = 0.8) +
  scale_color_brewer(palette = "Set1") +
  theme_minimal(base_size = 14) +
  labs(
    title = "ROC Curves for Each Fold",
    x = "1 - Specificity",
    y = "Sensitivity",
    color = "Fold"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 15, face = "bold"),
    legend.position = "bottom"
  ) +
  # Same scale on both axes so the plot is square.
  coord_equal()

ggplotly(roc_plot)